Library calls

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.2.1     ✔ dplyr   1.1.2
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tsibbledata)
library(tsibble)
## 
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:tsibble':
## 
##     interval
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(leaflet)
library(forcats)
library(infer)
library(slider)

Note: lubridate masks tsibble::interval and base::date/intersect/setdiff/union

theme set up

# set up color palette for citibank and lyft
lyft_pink <- "#FF00BF"
citibank_darkblue <- "#003A72" 
citibank_red <- "#DB230B"

Data types

  • start_time and stop_time are date time format - we can use these directly to extract weekdays, months, years, etc
  • start_station and stop_station are factor format - they include the id of the station locations
  • {start/end}_{lat/long} are format for mapping
  • birth_year is as a numeric - convert to datetime to calculate age at time of start_time
  • gender is and includes: Male, Female, Unknown
nyc_bikes %>% 
  distinct(gender)

classic or electric bike?

Note: no variable for bike type (classic or electric), which seems important for analysing how to increase ride business. Checking the helpfile source (https://citibikenyc.com/system-data) there is potentially newer data available for download from this website, which includes “rideable type”.

IF TIME: consider including this as an extra / easter egg at end of presentation (any other insights)

Data cleaning/wrangling

Convert start_time into date, year, month, etc

head(nyc_bikes)

start_time and stop_time are already in ISO … format (<S3: POSIXct> class) so do not need to be converted. Also, this is all about bikes in NYC so I am not converting timezone because it won’t affect any calculated data.

However, we can extract the year, month, etc from these variables, to enable analysis of hires over time as well as calculate rider age at time of hire.

nyc_bikes_wrangled <- nyc_bikes %>% 
  mutate(
    # extract elements from start_time
    start_year = year(start_time),
    start_month = month(start_time, label = TRUE, abbr = TRUE),
    start_day = day(start_time),
    start_weekday = wday(start_time, label = TRUE, abbr = TRUE),
    start_hour = hour(start_time),
    start_minute = minute(start_time),
    # make start dates and times
    start_date = make_date(year = start_year, month = start_month, day = start_day),
    start_timestamp = hm(str_c(as.character(start_hour),":",as.character(start_minute))),
    # extract same elements from stop_time
    stop_year = year(stop_time),
    stop_month = month(stop_time, label = TRUE, abbr = TRUE),
    stop_day = day(stop_time),
    stop_weekday = wday(stop_time, label = TRUE, abbr = TRUE),
    stop_hour = hour(stop_time),
    stop_minute = minute(stop_time),
    # make stop dates and times
    stop_date = make_date(year = stop_year, month = stop_month, day = stop_day),
    stop_timestamp = hm(str_c(as.character(stop_hour),":",as.character(stop_minute)))
    ) %>%
  # calculate additional variables
  mutate(
    # calculate ride duration in different units (hours, mins)
    ride_duration_period = as.period(stop_time - start_time),
    ride_duration_hours = as.numeric(ride_duration_period, "hours"),
    ride_duration_mins = as.numeric(ride_duration_period, "minutes"),
    # calculate rider age in years
    rider_age = start_year - birth_year
    )

head(nyc_bikes_wrangled)

Make a tibble for summary stats (not time series)

nyc_bikes_tibble <- as.tibble(nyc_bikes_wrangled)
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

check NAs again

nyc_bikes_tibble %>%
  summarise(across(
    .cols = everything(),
    .fns = ~ sum(is.na(.x))
  )) 

No NAs in summary data tibble.

year of data is 2018

# summary stats for overall df
nyc_bikes_tibble %>% 
  distinct(start_year)

All the data is from 2018.

inspect demographic data

# overall summary counts for user demographics

# (i) gender
nyc_bikes_tibble %>% 
  summarise(count = n(), .by = gender)
# (ii) age
nyc_bikes_tibble %>% 
  summarise(count = n(), .by = rider_age) %>% 
  arrange(rider_age)
nyc_bikes_tibble %>% 
  ggplot() +
  geom_histogram(aes(x = rider_age), colour = "white", bins = 50)

# is age 49 peak gender unknown?
nyc_bikes_tibble %>% 
  filter(rider_age == 49) %>% 
  summarise(count = n(), .by = gender)
# most of the 49-yo users have gender unknown - this is fishy, recode these rows to NAs for user demographics

Most of the 49-yo users have gender unknown - this is fishy, could be not real, so recode these rows to NA for age (other user demographics is fine, can filter out gender unknown for proportions and rider type still looks valid)

inspect ride data

## some very long durations - consider these relock failures?
nyc_bikes_tibble %>% 
  filter(ride_duration_mins > 100) %>% 
  select(ride_duration_mins, type, birth_year, gender, rider_age,
         start_time, stop_time, start_station, end_station) %>% 
  arrange(desc(ride_duration_mins))
# recode as NA any ride duration mins > 180 (3 hours) -- recodes 10 values
nyc_bikes_tibble %>% 
  filter(ride_duration_mins > 180) %>% 
  select(ride_duration_mins, type, birth_year, gender, rider_age,
         start_time, stop_time, start_station, end_station) %>% 
  arrange(desc(ride_duration_mins))

clean data

nyc_bikes_clean <- nyc_bikes_wrangled %>% 
  mutate(
        # (NOT USED) recode age to NA for 243 rows where age is 49 & gender is unknown
        # because this is an unusual peak in our age frequency data, suspect not real
        # rider_age = if_else(
        #                 rider_age == 49 & gender == "Unknown",
        #                 NA_integer_,
        #                 rider_age),
        # # recode riders over 100 to NA, 2 values of 130 and 131
        rider_age = if_else(rider_age > 100, NA_integer_, rider_age),
        # recode ride duration to NA if over 180 minutes
        # recodes 10 values, suspect not locked at a station at end of individual ride
        ride_duration_mins = if_else(
                        ride_duration_mins>180,
                        NA_integer_,
                        ride_duration_mins))
# recode age 49 to NA if gender unknown
nyc_bikes_summary <- nyc_bikes_tibble %>% 
  mutate(rider_age = if_else(
    rider_age == 49 & gender == "Unknown",
    NA_integer_,
    rider_age))

# recode as NA any ride duration mins > 180 (3 hours) -- recodes 10 values
nyc_bikes_tibble %>% 
  mutate(ride_duration_mins = if_else(
                                ride_duration_mins>180,
                                NA_integer_,
                                ride_duration_mins)) #%>% 
  # filter(ride_duration_mins > 120) %>% 
  # select(ride_duration_mins) # all > 180 have gone

Insight Qs

Homework Qs:

  1. What is the pattern of bike hires over time (e.g. within a year, month, week, or day)?
  2. Do bike hire patterns differ between bike rider demographics? (e.g. gender, type of trip, age)
  3. What is the geographical spread of the start points of bike hires?
  4. Optional: Can you predict bike use over the next couple of months? (This is based on the flipped time series forecasting lesson)
  5. Any other insights?

Other Qs:

Where do Citi Bikers ride? When do they ride? How far do they go? Which stations are most popular? What days of the week are most rides taken on? (source: https://citibikenyc.com/system-data)

For every visualisation you create, consider whether and how it can provide actionable insights to the business (in this case NYC Citi Bike, a bike share operator) and how it relates to key performance indicators (in this case the overall objective is to increase bike hires). Also, remember to include a description and interpretation of each plot so as to explain the context and insights.

1. Hire patterns over time

For ts, use nyc_bikes_clean

class(nyc_bikes_clean)
## [1] "tbl_ts"     "tbl_df"     "tbl"        "data.frame"
colnames(nyc_bikes_clean)
##  [1] "bike_id"              "start_time"           "stop_time"           
##  [4] "start_station"        "start_lat"            "start_long"          
##  [7] "end_station"          "end_lat"              "end_long"            
## [10] "type"                 "birth_year"           "gender"              
## [13] "start_year"           "start_month"          "start_day"           
## [16] "start_weekday"        "start_hour"           "start_minute"        
## [19] "start_date"           "start_timestamp"      "stop_year"           
## [22] "stop_month"           "stop_day"             "stop_weekday"        
## [25] "stop_hour"            "stop_minute"          "stop_date"           
## [28] "stop_timestamp"       "ride_duration_period" "ride_duration_hours" 
## [31] "ride_duration_mins"   "rider_age"

all hires (start_date)

nyc_bikes_clean %>%
  index_by(start_time) %>% 
  summarise(freq = n()) %>% 
  ggplot() + 
  geom_col(aes(x = start_time, y = freq))

month

nyc_bikes_clean %>%
  index_by(start_month) %>% 
  summarise(freq = n()) %>% 
  ggplot() + 
  geom_col(aes(x = start_month, y = freq))

August and July are most popular months, for both customers and subscribers:

month ~ type

nyc_bikes_summary %>% 
  ggplot() + 
  aes(x = start_month, fill = type) +
  geom_histogram(stat = "count", position = "dodge")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

Potentially target visitors in summer <- get day passes for tourists?

Look at visitor start locations, stop locations, durations at different times of year to hone in on this.

weekday

nyc_bikes_clean %>%
  index_by(start_weekday) %>% 
  summarise(freq = n()) %>% 
  ggplot() + 
  geom_col(aes(x = start_weekday, y = freq))

#### weekday ~ type

nyc_bikes_summary %>% 
  ggplot() + 
  aes(x = start_weekday, y = after_stat("count"), fill = type) +
  geom_col() +
  facet_wrap(~type, ncol=1)
## Warning in fun(x, ...): NAs introduced by coercion

## Warning in fun(x, ...): NAs introduced by coercion

## Warning in fun(x, ...): NAs introduced by coercion

## Warning in fun(x, ...): NAs introduced by coercion

## Warning in fun(x, ...): NAs introduced by coercion

## Warning in fun(x, ...): NAs introduced by coercion

## Warning in fun(x, ...): NAs introduced by coercion

## Warning in fun(x, ...): NAs introduced by coercion

## Warning in fun(x, ...): NAs introduced by coercion

## Warning in fun(x, ...): NAs introduced by coercion

## Warning in fun(x, ...): NAs introduced by coercion

## Warning in fun(x, ...): NAs introduced by coercion

## Warning in fun(x, ...): NAs introduced by coercion

## Warning in fun(x, ...): NAs introduced by coercion
## Warning: Removed 4268 rows containing missing values (geom_col).

Customers tend to use bikes more on weekends, whereas subscribers use them during the working week (Monday - Friday)

day (date)

nyc_bikes_clean %>%
  index_by(start_day) %>% 
  summarise(freq = n()) %>% 
  ggplot() + 
  geom_col(aes(x = start_day, y = freq))

Just looks like the weekly pattern (higher use on weekdays, driven by subcribers) - but note date number does not correspond to weekday so could be missing this

day ~ type

nyc_bikes_summary %>% 
  ggplot() + 
  aes(x = start_day, fill = type) +
  geom_histogram(stat = "count") +
  facet_wrap(~type, ncol=1)
## Warning: Ignoring unknown parameters: binwidth, bins, pad

start_date

hire_freq_rolling <- nyc_bikes_clean %>% 
  index_by(start_date) %>% 
  summarise(freq = n()) %>%
  mutate(freq_moving_avg = slide_dbl(
      .x = freq, 
      .f = ~ mean(., na.rm = TRUE),
      .before = 14,
      .after = 14,
      .complete = TRUE
    ))

hire_freq_rolling %>%
  ggplot() + 
  geom_line(aes(x = start_date, y = freq), colour = citibank_darkblue, alpha = 0.5) +
  geom_line(aes(x = start_date, y = freq_moving_avg), colour = lyft_pink, size = 2)
## Warning: Removed 28 row(s) containing missing values (geom_path).

start_hour

start_timestamp doesn’t work

nyc_bikes_clean %>%
  index_by(as.numeric(start_timestamp)) %>% 
  summarise(freq = n()) %>% 
  ggplot() + 
  geom_col(aes(x = start_timestamp, y = freq))

so use hour to show frequency of when ride starts

nyc_bikes_clean %>%
  index_by(start_hour) %>% 
  # summarise(freq = n()) %>% 
  ggplot() + 
  geom_histogram(aes(x = start_hour, fill = type), bins = 24, colour = "white", line = 0.5)
## Warning: Ignoring unknown parameters: line

  #geom_col(aes(x = start_hour, y = freq))

Peak usage is at 8-9am and 5-7pm i.e. commuter times

[ ] drilldown into start hour

separate into weekdays vs weekends; facet by type;

duration

duration (overall)

# ride duration
## note: cleaning step to recode (10) rides above 180 mins as NA

# data distribution for rides 60 mins or less
## note cleaning step has recoded 10 values about 180 to NA
## only 43 values higher than 60 remain
nyc_bikes_summary %>% 
  # filter(ride_duration_mins <= 180 & ride_duration_mins > 60) # 43 values
  filter(ride_duration_mins <= 60) %>% 
  ggplot(aes(x = ride_duration_mins)) +
  geom_histogram(bins = 100, colour = "white")

##  skewed data - test for skewness

nyc_bikes_summary %>% 
  filter(ride_duration_mins <= 60) %>% 
  ggplot(aes(x = ride_duration_mins)) +
  geom_boxplot()

# test for skewness
nyc_bikes_summary %>% 
  filter(ride_duration_mins <= 60) %>% 
  pull(ride_duration_mins) %>% 
  e1071::skewness(ride_duration_mins, type = 2)
## [1] 2.813439
# skewness: 2.813439
# moderately positively (right) skewed

# 5 number summary
nyc_bikes_summary %>% 
  filter(ride_duration_mins <= 60) %>% 
  select(ride_duration_mins) %>% 
  summary()
##  ride_duration_mins
##  Min.   : 1.018    
##  1st Qu.: 3.879    
##  Median : 5.790    
##  Mean   : 8.312    
##  3rd Qu.: 9.471    
##  Max.   :59.695
# ride_duration_mins
#  Min.   : 1.018    
#  1st Qu.: 3.879    
#  Median : 5.790    
#  Mean   : 8.312    
#  3rd Qu.: 9.471    
#  Max.   :59.695

## also use infer workflow to get CI so can report with median
null_dist_duration <- nyc_bikes_summary %>% 
  filter(ride_duration_mins <= 60) %>% 
  specify(response = ride_duration_mins) %>% 
  generate(reps = 10000, type = "bootstrap") %>% 
  calculate(stat = "median")

null_dist_duration %>% 
  get_ci(level = 0.95, type = "percentile")
## For rides of duration 1 hour or less, the median ride duration is 5.79 mins with 95% CI [5.67, 5.93]
# proportion rides less than 10 mins
nyc_bikes_summary %>% 
  filter(ride_duration_mins <= 10) %>% 
  summarise(prop = n()/nrow(nyc_bikes_summary))

ts duration

look at ride duration at different times

nyc_bikes_clean %>%
  index_by(start_date) %>% 
  ggplot() + 
  geom_line(aes(x = start_date, y = ride_duration_mins))

ts by another factor

https://feasts.tidyverts.org/

# a tsibble needs index and key
freqs_by_gender <- nyc_bikes_clean %>% 
  #update_tsibble(key = c(bike_id, gender)) %>% 
  index_by(start_date) %>%
  # can group_by variables (any) after indexing
  group_by(gender) %>%  
  summarise(freq = n()) %>% 
  mutate(freq_moving_avg = slide_dbl(
      .x = freq,
      .f = ~ mean(., na.rm = TRUE),
      .before = 10,
      .after = 10,
      .complete = TRUE
    ))
freqs_by_gender %>% 
  ggplot() +
  geom_line(aes(x = start_date, y = freq, colour = gender)) +
  scale_colour_manual(values = c(citibank_darkblue, citibank_red, lyft_pink)) +
  geom_line(aes(x = start_date, y = freq_moving_avg, colour = gender), size = 0.5) +
  labs(x = "\nStart date", y = "Number of hires\n",
       title = "Bike hires by gender\n") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        title = element_text(face = "bold"))
## Warning: Removed 20 row(s) containing missing values (geom_path).

freqs_by_type <- nyc_bikes_clean %>% 
  #update_tsibble(key = c(bike_id, gender)) %>% 
  index_by(start_date) %>%
  # can group_by variables (any) after indexing
  group_by(type) %>%  
  summarise(freq = n()) %>% 
  mutate(freq_moving_avg = slide_dbl(
      .x = freq,
      .f = ~ mean(., na.rm = TRUE),
      .before = 10,
      .after = 10,
      .complete = TRUE
    ))

freqs_by_type %>% 
  ggplot() +
  geom_line(aes(x = start_date, y = freq, colour = type), alpha = 0.5) +
  scale_colour_manual(values = c(lyft_pink, citibank_darkblue)) +
  geom_line(aes(x = start_date, y = freq_moving_avg, colour = type), size = 2) +
  labs(x = "\nStart date", y = "Number of hires\n",
       title = "Bike hires by user type\n") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        title = element_text(face = "bold"))
## Warning: Removed 20 row(s) containing missing values (geom_path).

ts_hire_freq_males <- nyc_bikes_clean %>% 
  filter(gender == "Male") %>% 
  index_by(start_date) %>% 
  summarise(freq = n()) %>%
  mutate(freq_moving_avg = slide_dbl(
      .x = freq, 
      .f = ~ mean(., na.rm = TRUE),
      .before = 10,
      .after = 10,
      .complete = TRUE
    ),
    gender = "Male")

ts_hire_freq_females <- nyc_bikes_clean %>% 
  filter(gender == "Female") %>% 
  index_by(start_date) %>% 
  summarise(freq = n()) %>%
  mutate(freq_moving_avg = slide_dbl(
      .x = freq, 
      .f = ~ mean(., na.rm = TRUE),
      .before = 10,
      .after = 10,
      .complete = TRUE
    ),
    gender = "Female")


ts_hire_freq_gender_uk <- nyc_bikes_clean %>% 
  filter(gender == "Unknown") %>% 
  index_by(start_date) %>% 
  summarise(freq = n()) %>%
  mutate(freq_moving_avg = slide_dbl(
      .x = freq, 
      .f = ~ mean(., na.rm = TRUE),
      .before = 10,
      .after = 10,
      .complete = TRUE
    ),
    gender = "Unknown")
ts_hire_freq_males %>%
  ggplot() + 
  geom_line(aes(x = start_date, y = freq), colour = citibank_red, alpha = 0.5) +
  geom_line(aes(x = start_date, y = freq_moving_avg), colour = lyft_pink, size = 2) +
  labs(x = "\nStart date", y = "Number of hires\n",
       title = "Bike hires by male users\n") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        title = element_text(face = "bold"))
## Warning: Removed 20 row(s) containing missing values (geom_path).

ts_hire_freq_females %>%
  ggplot() + 
  geom_line(aes(x = start_date, y = freq), colour = citibank_darkblue, alpha = 0.5) +
  geom_line(aes(x = start_date, y = freq_moving_avg), colour = lyft_pink, size = 2) +
  labs(x = "\nStart date", y = "Number of hires\n",
       title = "Bike hires by female users\n") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        title = element_text(face = "bold"))
## Warning: Removed 20 row(s) containing missing values (geom_path).

hire_freq_rolling <- nyc_bikes_clean %>% 
  index_by(start_date) %>% 
  summarise(freq = n()) %>%
  mutate(freq_moving_avg = slide_dbl(
      .x = freq, 
      .f = ~ mean(., na.rm = TRUE),
      .before = 10,
      .after = 10,
      .complete = TRUE
    ))

hire_freq_rolling %>%
  ggplot() + 
  geom_line(aes(x = start_date, y = freq), alpha = 0.5) +
  geom_line(aes(x = start_date, y = freq_moving_avg), colour = lyft_pink, size = 2) +
  labs(x = "\nStart date", y = "Number of hires\n",
       title = "Bike hires throughout the year\n") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        title = element_text(face = "bold"))
## Warning: Removed 20 row(s) containing missing values (geom_path).

2. Hires by demographics

(see above)

# overall summary counts for user demographics

# (i) gender
nyc_bikes_summary %>% 
  summarise(count = n(), .by = gender)
# (ii) age
nyc_bikes_summary %>% 
  summarise(count = n(), .by = rider_age) %>% 
  arrange(rider_age)
# age buckets by years
nyc_bikes_summary %>% 
  mutate(rider_age_bucket = case_when(
    rider_age <= 35 ~ "19-35",
    rider_age <= 50 ~ "36-50",
    .default = "51 and above"
  )) %>% 
  summarise(count = n(), .by = rider_age_bucket)
# age buckets to even out size of buckets
nyc_bikes_summary %>% 
  mutate(rider_age_bucket = case_when(
    rider_age <= 25 ~ "19-25",
    rider_age <= 30 ~ "26-30",
    rider_age <= 35 ~ "31-35",
    rider_age <= 40 ~ "36-40",
    rider_age <= 50 ~ "41-50",
    .default = "51 and above"
  )) %>% 
  summarise(count = n(), .by = rider_age_bucket) %>% 
  arrange(rider_age_bucket)
nyc_bikes_summary %>% 
  ggplot() +
  geom_histogram(aes(x = rider_age), colour = "white", bins = 50)
## Warning: Removed 243 rows containing non-finite values (stat_bin).

# (iii) user type
nyc_bikes_summary %>%
  group_by(type) %>% 
  summarise(count = n(),
            prop = count/nrow(nyc_bikes_summary))
# (iv) type by gender
nyc_bikes_summary %>% 
  ggplot() +
  geom_bar(aes(x = gender, fill = type))

# (v) type by age
nyc_bikes_summary %>% 
  ggplot() +
  geom_bar(aes(x = rider_age, fill = type), show.legend = FALSE) +
  facet_wrap( ~ type, ncol = 1)
## Warning: Removed 243 rows containing non-finite values (stat_count).

nyc_bikes_summary %>% 
  ggplot() +
  geom_boxplot(aes(x = rider_age, fill = type))
## Warning: Removed 243 rows containing non-finite values (stat_boxplot).

# (vi) gender by age
nyc_bikes_summary %>% 
  ggplot() +
  geom_bar(aes(x = rider_age, fill = gender), show.legend = FALSE) +
  facet_wrap( ~ gender, ncol = 1)
## Warning: Removed 243 rows containing non-finite values (stat_count).

  • Rider age 26-35 is most frequent user demographics
  • 92.6% of hires are made by subscribers; the customer segment is small

duration ~ demographic

duration ~ gender
# ride duration by gender
nyc_bikes_summary %>% 
  filter(ride_duration_mins <= 60) %>% 
  filter(gender != "Unknown") %>% 
  ggplot(aes(x = ride_duration_mins, fill = gender)) +
  geom_boxplot()

## doesn't look like there is a difference in duration by gender
** duration ~ type
# ride duration by hire type
nyc_bikes_summary %>% 
  filter(ride_duration_mins <= 60) %>% 
  ggplot(aes(x = ride_duration_mins, fill = type)) +
  geom_boxplot()

## looks like subscribers take longer bike rides - test this

2023 website says memberships can hire for 45 minutes then 17c/min charge applies (https://citibikenyc.com/pricing/annual; regardless of bike type); passes hire for 30 mins then 26c/min charge applies (or additional $4 for every 15 mins over - https://citibikenyc.com/how-it-works/bike-rental-nyc).

Might be interesting to calculate income from additional minutes for each type.

*duration ~ age
# rider_age or gender or type ~ ride_duration (time in mins)
nyc_bikes_summary %>% 
  filter(rider_age < 100,
         ride_duration_mins <= 60) %>% 
  ggplot() +
  aes(x = rider_age, y = ride_duration_mins) +
  geom_point()

## no discernible pattern
## could bucket the duration to see if any differences (e.g. fewer older people take longer rides)

3. Geospatial mapping

library(leaflet) # in library calls at top of report

Start stations (frequency)

Q: What is the geographical spread of the start points of bike hires?

# using nyc_bikes_wrangled
# summary table of start locations by count of unlocks (n) and proportion (% total)
start_freq <- nyc_bikes_summary %>% 
  count(start_station, start_long, start_lat) %>% 
  mutate(popup = str_c("Station: ", start_station,
                       "; unlock count: ", n),
         prop_perc = 100*(n/sum(n))) %>% 
  arrange(desc(prop_perc))

# make df of to 20 most used start stations
start_freq_top20 <- start_freq %>%
  head(20)

# make vector of top 20 most used start station IDs 
start_freq_top20_names <- start_freq_top20 %>% 
  pull(start_station)
# histogram of all start locations coloured by hire frequency
start_freq %>% 
  ggplot() +
  geom_col(aes(x = n, y = reorder(start_station, n))) +
  theme_minimal() +
  labs(x = "Number of unlocks", y = "Start station") +
  theme(axis.text.y = element_text(size = 6))

# histogram of top 20 locations
start_freq %>% 
  slice_max(n, n = 20) %>% 
  ggplot() +
  geom_col(aes(x = n, y = reorder(start_station, n))) +
  theme_minimal() +
  labs(x = "Number of unlocks", y = "Start station\n") +
  theme(axis.text.y = element_text(size = 6))

starts frequency map

# INCLUDE:
# start_freq <- nyc_bikes_summary %>% 
#   count(start_station, start_long, start_lat) %>% 
#   mutate(popup = str_c("Station: ", start_station,
#                        "; unlock count: ", n),
#          prop_perc = 100*(n/sum(n))) %>% 
#   arrange(desc(prop_perc))

# function to colour marker by count (heat intensity)
palette <- colorNumeric(
  palette = "plasma",
  domain = start_freq$n
)

leaflet(start_freq) %>% 
  addTiles() %>% 
  addCircleMarkers(
    lng = ~start_long,
    lat = ~start_lat,
    popup = ~ popup,
    stroke = TRUE,
    color = "black",
    opacity = 1,
    weight = 3,
    fillColor = ~ palette(n),
    fillOpacity = 1,
    radius = 8
  )

In 2018, Station 3186 had the most unlocks, busiest pick-up station.

3195 as central coordinates

nyc_bikes_summary %>% 
  filter(start_station == 3195) %>% 
  head(1) %>% 
  select(start_station, start_lat, start_long)

start ~ type of user

starts_by_type <- nyc_bikes_summary %>% 
  group_by(as.character(start_station), as.character(type)) %>% 
  summarise(unlocks = n())
## `summarise()` has grouped output by 'as.character(start_station)'. You can
## override using the `.groups` argument.

Use forcats to manipulate factor data and reorder bar chart:

library(forcats)
nyc_bikes_summary %>% 
  # filter for top 20 most used start stations
  filter(start_station %in% start_freq_top20_names) %>% 
  ggplot() +
  geom_bar(aes(y = fct_rev(fct_infreq(start_station)), fill = as.character(type)))

[ ] duration ~ start location

Stop stations (frequency)

not explored